Table. eQTL Results


In [1]:
import copy
import cPickle
import os
import subprocess

import cdpybio as cpb
import matplotlib as mpl
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import pybedtools as pbt
import scipy.stats as stats
import seaborn as sns

import ciepy
import cardipspy as cpy

%matplotlib inline

dy_name = 'table_eqtl_results'
    
outdir = os.path.join(ciepy.root, 'output', dy_name)
cpy.makedir(outdir)

private_outdir = os.path.join(ciepy.root, 'private_output', dy_name)
cpy.makedir(private_outdir)

import socket
if socket.gethostname() == 'fl-hn1' or socket.gethostname() == 'fl-hn2':
    dy = os.path.join(ciepy.root, 'sandbox', 'tmp', dy_name)
    cpy.makedir(dy)
    pbt.set_tempdir(dy)

In [2]:
gene_info = pd.read_table(cpy.gencode_gene_info, index_col=0)

In [3]:
fn = os.path.join(ciepy.root, 'misc', 'stem_cell_population_maintenance.tsv')
go_scpm = pd.read_table(fn, header=None)
go_genes = set(go_scpm[2]) & set(gene_info.gene_name)

In [4]:
def get_gene_id(x):
    return gene_info[gene_info.gene_name == x].index[0]

In [5]:
# Note: sometimes this cell fails. If I run several times eventually it works. Not sure 
# why Nature's URL is weird.
url = 'http://www.nature.com/nbt/journal/v33/n11/extref/nbt.3387-S5.xlsx'
scorecard = pd.read_excel(url)
scorecard = scorecard.drop(scorecard.columns[2:], axis=1)
scorecard = scorecard[scorecard.gene.apply(lambda x: x in gene_info.gene_name.values)]
scorecard.index = [get_gene_id(x) for x in scorecard.gene]
scorecard = scorecard.drop(['gene'], axis=1)
scorecard.columns = ['tsankov_class']

In [6]:
def annotate_eqtls(fn):
    df = pd.read_table(fn, index_col=0)
    if 'perm_sig' in df.columns:
        df = df[df.perm_sig]
    td = [x for x in df.columns if 'AF' in x]
    df = df.drop(td, axis=1)
    df = df.merge(scorecard, left_on='gene_id', right_index=True, how='left')
    tdf = pd.DataFrame(True, index=go_genes, columns=['go_stem_cell_population_maintenance'])
    df = df.merge(tdf, left_on='gene_name', right_index=True, how='left')
    df.ix[df.go_stem_cell_population_maintenance.isnull(), 'go_stem_cell_population_maintenance'] = False
    return df

In [7]:
writer = pd.ExcelWriter(os.path.join(outdir, 'table_eqtl_results.xlsx'))

In [8]:
for i in [1, 2, 3]:
    fn = os.path.join(ciepy.root, 'output', 'eqtl_processing', 'eqtls0{}'.format(i), 'lead_variants.tsv')
    leads = annotate_eqtls(fn)
    leads.drop('genocnt', axis=1).to_excel(writer, 'leads0{}'.format(i))
    fn = os.path.join(ciepy.root, 'output', 'eqtl_processing', 'eqtls0{}'.format(i), 'gene_variant_pairs.tsv')
    all_eqtls = annotate_eqtls(fn)
    all_eqtls.drop('genocnt', axis=1).to_excel(writer, 'all0{}'.format(i))

In [9]:
writer.save()